# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import re
import pickle
import os
# Project root: parent of the current working directory (the notebook runs from a subfolder)
path_dir = os.path.dirname(os.getcwd())
import plotly
# Enable inline plotly rendering in the notebook
plotly.offline.init_notebook_mode()
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
# White theme + notebook renderer applied to all figures below
pio.templates.default = "plotly_white"
pio.renderers.default='notebook'
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
cd ../src/
/Users/linafaik/Documents/survival_analysis/src
from train import *
from train_survival_ml import *
from train_survival_deep import *
# Load the cleaned customer-subscription dataset produced by the preprocessing step
df = pd.read_csv(os.path.join(path_dir, "outputs/customer_subscription_clean.csv"))
# Parameters
scaler_name = "StandardScaler" #MinMaxScaler
random_state = 123
# covariate columns (used when possible)
# NOTE(review): categorical columns appear one-hot encoded ("product=prd_1",
# "gender=female", ...) and dates decomposed into sin/cos cyclical features
cols_x = [
    'price', 'billing_cycle', 'age',
    'product=prd_1', 'gender=female', 'channel=email', 'reason=support',
    'nb_cases', 'time_since_signup',
    'date_month_cos', 'date_month_sin',
    'date_weekday_cos', 'date_weekday_sin', 'date_hour_cos',
    'date_hour_sin'
]
# Survival time (in days, per the axis labels used below)
col_target = "duration"
# Hold out 15% for test, then carve ~20% of the remainder for validation;
# both splits are stratified on the censoring flag so event rates stay comparable.
Xy_train, Xy_test, y_train, y_test = split_train_test(
    df, cols_x, col_target, test_size=0.15, col_stratify="censored", random_state=random_state)
Xy_train, Xy_val, y_train, y_val = split_train_test(
    Xy_train, cols_x, col_target, test_size=0.2, col_stratify="censored", random_state=random_state)

# Report the effective share of each split
n_train, n_test, n_val = (frame.shape[0] for frame in (Xy_train, Xy_test, Xy_val))
n_tot = n_train + n_test + n_val
print("Train: {}%, Test: {}%, Val: {}%".format(
    round(n_train / n_tot * 100),
    round(n_test / n_tot * 100),
    round(n_val / n_tot * 100)
))
Train: 68%, Test: 15%, Val: 17%
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Resolve the configured scaler through an explicit mapping instead of eval():
# eval on a config string is fragile and unsafe, and a typo would fail obscurely.
_SCALERS = {"StandardScaler": StandardScaler, "MinMaxScaler": MinMaxScaler}
scaler = _SCALERS[scaler_name]()

# Fit on the training covariates only, then apply the same transform to the
# held-out sets. BUG FIX: the validation set was previously left unscaled even
# though it is scored later (estimator.score(Xy_val[cols_x], ...)) against
# models trained on scaled features.
Xy_train[cols_x] = scaler.fit_transform(Xy_train[cols_x])
Xy_test[cols_x] = scaler.transform(Xy_test[cols_x])
Xy_val[cols_x] = scaler.transform(Xy_val[cols_x])
from sksurv.nonparametric import kaplan_meier_estimator
# Overall Kaplan-Meier survival curve on the training set.
# NOTE(review): kaplan_meier_estimator's first argument is the event indicator
# (True = event observed). Passing the "censored" column directly assumes it
# actually flags observed events (churn), not censoring — confirm against the
# preprocessing step, otherwise the curve is inverted.
time, probas = kaplan_meier_estimator(Xy_train["censored"].astype(bool), Xy_train[col_target])
fig = px.line(x=time, y=probas, width=800, height = 400)
fig.update_layout(dict(xaxis={'title' : 'Time (# days)'}, yaxis={'title' : 'Survival probability'}))
from sksurv.nonparametric import kaplan_meier_estimator
# Kaplan-Meier curves per product (one curve per unique value of the
# one-hot product column).
col = "product=prd_1"
# NOTE(review): this filters the FULL dataframe `df` — not Xy_train, despite
# the variable name. That may be deliberate: Xy_train's covariates were just
# rescaled above, so its product column no longer holds the raw 0/1 values
# that map_product expects in the next cell. It does mean these curves mix
# train and test customers while the "all products" curve is train-only —
# confirm the intent.
for i, v in enumerate(df[col].unique()):
    Xy_train_filter = df[df[col] == v]
    time_prd, probas_prd = kaplan_meier_estimator(
        Xy_train_filter["censored"].astype(bool), Xy_train_filter[col_target])
    proba_df = pd.DataFrame(
        {'time': time_prd, col : v,
        'proba': probas_prd}
    )
    # Accumulate one long frame; new product's rows are prepended each pass
    preds = proba_df if i ==0 else pd.concat([proba_df, preds], axis=0)
preds.head()
| | time | product=prd_1 | proba |
|---|---|---|---|
| 0 | 1.0 | 0 | 0.998538 |
| 1 | 2.0 | 0 | 0.997063 |
| 2 | 3.0 | 0 | 0.995625 |
| 3 | 4.0 | 0 | 0.994052 |
| 4 | 5.0 | 0 | 0.992715 |
# product 1 = annual subscription
# product 2 = monthly subscription
map_product = {1: "annual subscription", 0:"monthly subscription"}

# Overall KM curve first, then one trace per product, then yearly markers —
# trace order fixes the legend and the automatic color assignment.
data = [
    go.Scatter(
        x=time, y=probas,
        line_color = 'grey',
        name = "all products")
]
for v in preds[col].unique():
    subset = preds[preds[col] == v]
    data.append(go.Scatter(
        x=subset.time,
        y=subset.proba,
        name = map_product[v]))
for yr in range(1, 5):
    data.append(go.Scatter(
        x=[365*yr, 365*yr], y=[0, 1],
        name=f"year {yr}", line_color = 'lightgrey'))

fig = go.Figure(data)
fig
fig.update_layout(
    dict(
        width=800, height = 400,
        xaxis={'title' : 'nb days'},
        yaxis={'title' : 'proba'}
    )
)
from sksurv.linear_model import CoxPHSurvivalAnalysis

# Train a Cox proportional-hazards model (alpha is the ridge penalty strength)
estimator = CoxPHSurvivalAnalysis(alpha=0.5)
estimator = estimator.fit(Xy_train[cols_x], y_train)

# number of customers to inspect
N = 5
# Fixed customer ids so the plot is reproducible.
# FIX: the original drew np.random.choice(Xy_test.index, N) and immediately
# overwrote it with this hard-coded list — the dead call (which also advanced
# the global NumPy RNG state) is removed.
rnd_idx = [194386, 279993, 174244, 239372, 185588]
# predict each selected customer's cumulative hazard function
chf_funcs = estimator.predict_cumulative_hazard_function(Xy_test[cols_x].loc[rnd_idx])
# One cumulative-hazard step curve per sampled customer
data = [
    go.Scatter(
        x=fn.x, y= fn(fn.x),
        name='customer {}'.format(i+1))
    for i, fn in enumerate(chf_funcs)
]
# Vertical grey reference lines at each year boundary
data += [
    go.Scatter(
        x=[365*i, 365*i], y=[0, 1],
        name=f"year {i}", line_color = 'lightgrey')
    for i in range(1, 5)
]
fig = go.Figure(data, layout=dict(width=800, height=400))
# Clamp the y-axis to [0, 1] for readability
fig.update_layout({"yaxis":{"range": [0,1]}})
# predict survival function
surv_funcs = estimator.predict_survival_function(Xy_test[cols_x].loc[rnd_idx])

# plot results: one survival curve per customer, plus yearly reference lines
data = []
for i, fn in enumerate(surv_funcs):
    data.append(go.Scatter(
        x=fn.x, y=fn(fn.x),
        name='customer {}'.format(i+1)))
for yr in range(1, 5):
    data.append(go.Scatter(
        x=[365*yr, 365*yr], y=[0, 1],
        name=f"year {yr}", line_color = 'lightgrey'))
go.Figure(data, layout=dict(width=800, height=400))
# Cox coefficients plotted as feature importances (project helper)
feat_importance, fig = plot_feat_imp(cols_x, estimator.coef_)
fig.show()
from sksurv.metrics import concordance_index_censored
# Harrell's C-index on the test set.
# NOTE(review): "censored" is again passed as the event indicator (True =
# event observed) — same assumption as the Kaplan-Meier cell above; confirm.
prediction = estimator.predict(Xy_test[cols_x])
result = concordance_index_censored(list(Xy_test.censored.astype(bool)), Xy_test[col_target], prediction)
result
# c-index, concordant, discordant, tied_risk, tied_time
(0.6812899789827326, 603082578, 282117916, 24006, 352836)
from sksurv.metrics import concordance_index_ipcw
# IPCW-adjusted C-index: reweights pairs by the censoring distribution
# estimated from the training outcomes (y_train)
result = concordance_index_ipcw(y_train, y_test, prediction)
result
# c-index, concordant, discordant, tied_risk, tied_time
(0.6639232584980025, 603082578, 282117916, 24006, 352836)
from sksurv.metrics import cumulative_dynamic_auc
risk_score = estimator.predict(Xy_test[cols_x])
#times = np.percentile(df[col_target], np.linspace(5, 81, 15))
# Time-dependent AUC averaged over every day of the first year
times = np.arange(1, 365+1)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)
mean_auc
0.7854491726177352
# Same metric over the second year — per the printed means, discrimination
# drops from ~0.785 (year 1) to ~0.718 (year 2)
times = np.arange(365+1, 365*2+1)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)
mean_auc
0.7176921175567605
# Time-dependent AUC evaluated at 15 horizons spread over the duration
# distribution (5th to 81st percentile), then plotted with yearly markers.
times = np.percentile(df[col_target], np.linspace(5, 81, 15))
auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)

fig = px.line(x=times, y= auc, width=800, height=400)
n_years = int(max(times)/365)+1
year_traces = []
for yr in range(1, n_years):
    year_traces.append(go.Scatter(
        x=[365*yr, 365*yr], y=[0, 1],
        name=f"year {yr}", line_color = 'lightgrey'))
fig = fig.add_traces(year_traces)
fig.update_layout({
    "xaxis": dict(title = "Time (#days)"),
    "yaxis": dict(title = "Time-dependent AUC")
})
from sksurv.metrics import brier_score, integrated_brier_score
# Brier score of the predicted survival probabilities at a single horizon
survs = estimator.predict_survival_function(Xy_test[cols_x])
# half a year (182 days)
T = 364/2
# Evaluate each customer's survival function at T
preds = [fn(T) for fn in survs]
times, score = brier_score(y_train, y_test, preds, T)
score
array([0.13207416])
# Integrated Brier score over the first year
times = np.arange(1, 365)
# FIX: np.row_stack is deprecated and removed in NumPy 2.0 — np.vstack is the
# direct, behavior-identical replacement.
preds = np.vstack([fn(times) for fn in survs])
score = integrated_brier_score(y_train, y_test, preds, times)
print(score)
# Compare the model's integrated Brier score over year 1 against random and
# Kaplan-Meier baselines (project helper — "bier" is the helper's own spelling)
times = np.arange(1, 365)
survs = estimator.predict_survival_function(Xy_test[cols_x])
get_bier_score(Xy_test, y_train, y_test, survs, times)
{'estimator': 0.12473187839495915,
'random': 0.25001400864150725,
'kaplan_meier': 0.1475128902478881}
# Same baseline comparison over a three-year horizon
times = np.arange(1, 365*3)
survs = estimator.predict_survival_function(Xy_test[cols_x])
get_bier_score(Xy_test, y_train, y_test, survs, times)
{'estimator': 0.169087465222658,
'random': 0.2501781867945744,
'kaplan_meier': 0.19564116760972858}
from sklearn.model_selection import KFold
# 5-fold CV splitter shared by all the grid searches below
cv = KFold(n_splits=5, random_state=random_state, shuffle=True)
from sksurv.linear_model import CoxnetSurvivalAnalysis
# Elastic-net Cox: sweep the L1/L2 mixing ratio at a fixed alpha_min_ratio
grid_params = {
    'l1_ratio': [0.1, 0.5, 0.9, 1],
    'alpha_min_ratio': [0.01],
}
estimator_cox, results = grid_search(
    grid_params, Xy_train, cv, CoxnetSurvivalAnalysis, cols_x, col_target, verbose = True)
4 total scenario to run
1/4: params: {'l1_ratio': 0.1, 'alpha_min_ratio': 0.01}
Fold 0: 0.682
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
2/4: params: {'l1_ratio': 0.5, 'alpha_min_ratio': 0.01}
Fold 0: 0.681
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
3/4: params: {'l1_ratio': 0.9, 'alpha_min_ratio': 0.01}
Fold 0: 0.681
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
4/4: params: {'l1_ratio': 1, 'alpha_min_ratio': 0.01}
Fold 0: 0.681
Fold 1: 0.682
Fold 2: 0.682
Fold 3: 0.681
Fold 4: 0.682
results
| | l1_ratio | alpha_min_ratio | fold_0 | fold_1 | fold_2 | fold_3 | fold_4 | time | mean | std |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.1 | 0.01 | 0.681810 | 0.682205 | 0.681779 | 0.681432 | 0.682394 | 0.959791 | 0.681924 | 0.000379 |
| 1 | 0.5 | 0.01 | 0.681438 | 0.682018 | 0.681717 | 0.681297 | 0.682325 | 1.437839 | 0.681759 | 0.000420 |
| 2 | 0.9 | 0.01 | 0.681468 | 0.681959 | 0.681534 | 0.681278 | 0.682319 | 0.891154 | 0.681712 | 0.000421 |
| 3 | 1.0 | 0.01 | 0.681468 | 0.681991 | 0.681521 | 0.681277 | 0.682321 | 0.943610 | 0.681716 | 0.000428 |
# C-index of the selected elastic-net Cox model on the validation set
estimator_cox.score(Xy_val[cols_x], y_val)
0.6676465625254092
# Persist the fitted model
with open(os.path.join(path_dir, "outputs/cox_ph.pkl"), "wb") as f:
    pickle.dump(estimator_cox, f)
# Coxnet stores one coefficient column per alpha along the regularization
# path; the last column is taken here — TODO(review) confirm which alpha on
# the path this corresponds to
feat_importance_cox, _ = plot_feat_imp(cols_x, estimator_cox.coef_[:,-1])
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
# Gradient-boosted survival model: sweep only the number of trees
# (20..100 in steps of 20) at shallow depth; other knobs left commented out
grid_params = {
    "n_estimators": [20*i for i in range(1,6)],
    "max_depth": [2],
    #"min_samples_leaf": [2, 3],
    "learning_rate": [0.1],
    #"subsample": [0.5],
    "random_state": [random_state],
    "verbose":[1]}
estimator_gb, results = grid_search(
    grid_params, Xy_train, cv, GradientBoostingSurvivalAnalysis,
    cols_x, col_target, verbose = True)
results
<frozen importlib._bootstrap>:228: RuntimeWarning: sklearn.tree._criterion.Criterion size changed, may indicate binary incompatibility. Expected 328 from C header, got 528 from PyObject <frozen importlib._bootstrap>:228: RuntimeWarning: sklearn.tree._splitter.Splitter size changed, may indicate binary incompatibility. Expected 1160 from C header, got 1360 from PyObject <frozen importlib._bootstrap>:228: RuntimeWarning: sklearn.tree._criterion.ClassificationCriterion size changed, may indicate binary incompatibility. Expected 1168 from C header, got 1368 from PyObject <frozen importlib._bootstrap>:228: RuntimeWarning: sklearn.tree._criterion.RegressionCriterion size changed, may indicate binary incompatibility. Expected 960 from C header, got 1160 from PyObject
5 total scenario to run
1/5: params: {'n_estimators': 20, 'max_depth': 2, 'learning_rate': 0.1, 'random_state': 123, 'verbose': 1}
Iter Train Loss Remaining Time
1 1518066.9232 152.69m
2 1516536.4925 144.96m
3 1515189.8359 181.59m
# Validation C-index of the selected gradient-boosting model
estimator_gb.score(Xy_val[cols_x], y_val)
# Tree-based feature importances
feat_importance_gb, fig = plot_feat_imp(cols_x, estimator_gb.feature_importances_)
fig
# Persist the fitted model
with open(os.path.join(path_dir, "outputs/gradient_boosting.pkl"), "wb") as f:
    pickle.dump(estimator_gb, f)
from sksurv.svm import FastSurvivalSVM

# Linear survival SVM: sweep the regularization strength alpha.
# FIXES: removed a duplicated import line, and run the grid search on Xy_train
# instead of the full dataframe `df` — searching on df leaks the held-out
# test/validation rows into training and is inconsistent with the Cox and
# gradient-boosting searches above, which use Xy_train (scaled features).
grid_params = {
    "alpha": [1, 2, 5, 10],
    "rank_ratio": [0],
    "max_iter": [1000],
    "tol": [1e-5],
    "random_state": [random_state],
    "verbose": [0]}
estimator_svm, results = grid_search(grid_params, Xy_train, cv, FastSurvivalSVM, cols_x, col_target, verbose = True)
results
# Persist the fitted model
with open(os.path.join(path_dir, "outputs/svm.pkl"), "wb") as f:
    pickle.dump(estimator_svm, f)
from sksurv.svm import FastKernelSurvivalSVM

# Kernel survival SVM: sweep the kernel at a fixed regularization strength.
# FIX: run the grid search on Xy_train instead of the full dataframe `df`,
# which leaks the held-out test/validation rows (keeps this consistent with
# the other grid searches above).
grid_params = {
    "kernel": ["linear","poly","rbf","sigmoid","cosine"],
    "alpha": [2],
    "rank_ratio": [0],
    "max_iter": [1000],
    "tol": [1e-5],
    "random_state": [random_state]
}
estimator_ksvm, results = grid_search(grid_params, Xy_train, cv, FastKernelSurvivalSVM, cols_x, col_target, verbose = True)
results
# Persist the fitted model
with open(os.path.join(path_dir, "outputs/ksvm.pkl"), "wb") as f:
    pickle.dump(estimator_ksvm, f)